<?php

namespace StudyBuddy;

/**
 * Special class that represents html of one rss feed item
 * as an instance of DomDocument
 *
 * This class can also be used to load any submitted
 * html document like document we get after converting
 * MS Word or OpenOffice or Email in html format
 *
 * It's very useful if we need to extract the embedded images
 * and then we need to change the src attributes of such images
 * to point to our extracted/resized/saved images.
 *
 *
 * Being an instance of DomDocument makes it easy to
 * manipulate the html tags, like adding the nofollow to all links,
 * extracting all images, getting number of links in item
 * as well as doing some extra non-dom related manipulation
 * like adding our own links to it, etc.
 *
 */
class DomFeedItem extends \DOMDocument {

    /**
     * Number of 'a' elements counted by the most recent getAllLinks() call
     * @var int
     */
    protected $intLinksCount = 0;

    /**
     * Number of 'img' elements counted by the most recent getImages() call
     * @var int
     */
    protected $intImgCount = 0;

    /**
     * Array of extracted image nodes data,
     * filled by getImages(true).
     * Each element is an array
     * that contains the attribute name => value
     * example: array('src' => '/image/cool.jpg', 'width'=>'250px')
     *
     * @var array
     */
    protected $aImages = array();

    /**
     * Flag indicates to add rel=nofollow to all links
     *
     * @var bool
     */
    protected $bNofollow = true;

    /**
     * Prefix used by fixImgBaseUri() for img 'src' values
     * that are not absolute.
     * loadFeedItem() only sets it when the supplied base uri
     * starts with 'http' and always stores it with
     * a single trailing slash.
     *
     * @var string
     */
    public $baseUri = '';

    /**
     * Getter for this->intLinksCount
     * The value is only updated by getAllLinks()
     * (which setRelNofollow() calls when nofollow is enabled)
     *
     * @return int
     */
    public function getLinksCount() {

        d(' $this->intLinksCount: ' . $this->intLinksCount);

        return $this->intLinksCount;
    }

    /**
     * Getter for intImgCount
     * The value is only updated by getImages()
     *
     * @return int
     */
    public function getImgCount() {

        return $this->intImgCount;
    }

    /**
     * Setter for the nofollow flag that is
     * checked by setRelNofollow()
     *
     * @param bool $bNoFollow
     */
    public function setNoFollow($bNoFollow = true) {
        $this->bNofollow = $bNoFollow;
    }

    /**
     * Factory method
     * Makes the object of this class
     * and loads the html string, first wrapping the html
     * string into a complete html document whose body
     * contains a single <div>
     *
     * @param Utf8String $oHtml html to load; its valueOf() result
     * is used as the html string (presumably already utf-8,
     * as the class name suggests)
     *
     * @param string $sBaseUri used as base uri for relative
     * image src values, but only when it starts with 'http'
     * and is longer than 12 chars
     *
     * @param bool $bAddNoFollow if true then rel="nofollow"
     * (and target="_blank") is added to all links that have href
     *
     * @param bool $parseCodeTags if true then parseCodeTags() is run
     *
     * @return object of this class
     *
     * @throws DevException if unable to load the string
     */
    public static function loadFeedItem(Utf8String $oHtml, $sBaseUri = '', $bAddNoFollow = true, $parseCodeTags = true) {

        $oDom = new self('1.0', 'utf-8');
        $oDom->encoding = 'UTF-8';
        $oDom->preserveWhiteSpace = true;
        $oDom->recover = true;
        $oDom->setNoFollow($bAddNoFollow);

        /**
         * The META Content-Type with charset=utf-8 tells
         * the html parser that input is already in utf-8,
         * otherwise it may assume a different charset
         * and recode the already valid utf-8 string.
         *
         * The whole string is also wrapped in <div>
         * so that getFeedItem() can return
         * just the contents of that first div
         */
        $sHtml = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
                      "http://www.w3.org/TR/REC-html40/loose.dtd">
			<html>
			<head>
  			<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
			</head>
			<body><div>' . $oHtml->valueOf() . '</div></body></html>';

        /**
         * Parser warnings about sloppy html are expected
         * and silenced, but the previous error_reporting level
         * must be restored BEFORE we throw, otherwise a failed
         * load would leave error reporting switched off
         */
        $ER = error_reporting(0);
        $bLoaded = @$oDom->loadHTML($sHtml);
        error_reporting($ER);

        if (false === $bLoaded) {
            throw new DevException('Error. Unable to load html string: ' . $sHtml);
        }

        /**
         * If $sBaseUri begins with http
         * then set the baseUri value to this value
         * and make sure it always ends with exactly one forward slash
         */
        if (!empty($sBaseUri) && ('http' === substr($sBaseUri, 0, 4)) && (strlen($sBaseUri) > 12)) {
            $oDom->baseUri = rtrim($sBaseUri, '/') . '/';
        }

        $oDom->setRelNofollow();

        if ($parseCodeTags) {
            $oDom->parseCodeTags();
        }

        $oDom->fixImgBaseUri();

        return $oDom;
    }

    /**
     * Setter for this->baseUri
     * The value is stored as-is: unlike loadFeedItem()
     * this does not validate it or add the trailing slash
     *
     * @param string $s
     * @return object $this
     */
    public function setBaseUri($s) {
        $this->baseUri = $s;

        return $this;
    }

    /**
     * Add attributes rel="code" class="c"
     * to all 'code' tags that do not already
     * have the 'rel' attribute
     *
     * @return object $this
     */
    public function parseCodeTags() {
        $aCode = $this->getElementsByTagName('code');
        $numItems = $aCode->length;
        if (0 === $numItems) {
            d('no code elements');

            return $this;
        }

        for ($i = 0; $i < $numItems; $i += 1) {
            $node = $aCode->item($i);

            // existing 'rel' means the tag was already processed
            // (or was set by the author), leave it alone
            if (!$node->hasAttribute('rel')) {
                $node->setAttribute('rel', 'code');
                $node->setAttribute('class', 'c');
            }
        }

        return $this;
    }

    /**
     * Get the inner markup of the first 'div' element
     * of this document (for documents made by loadFeedItem()
     * this is the wrapper div that holds the feed item)
     *
     * The markup is serialized with saveXML()
     *
     * @return string contents of the first div without
     * the div tags themselves, or empty string if document
     * has no div or the div has no child nodes
     */
    public function getFeedItem() {

        $oDiv = $this->getElementsByTagName('div')->item(0);

        /**
         * Without a div the saveXML(null) would dump the whole document,
         * and a div without children is serialized as self-closing tag,
         * in both cases there is no inner markup to cut out
         */
        if (null === $oDiv || !$oDiv->hasChildNodes()) {

            return '';
        }

        $xml = $this->saveXML($oDiv);

        /**
         * Opening tag ends at the first '>'
         * ('>' inside attribute values is escaped by serializer)
         * so this also works when the div has attributes.
         * The closing </div> is always the last 6 chars
         */
        $start = strpos($xml, '>') + 1;

        return substr($xml, $start, -6);
    }

    /**
     * Get all 'a' elements of this document
     * and update the links counter
     *
     * @return mixed object DOMNodeList | null if no 'a' elements found
     */
    public function getAllLinks() {
        $aLinks = $this->getElementsByTagName('a');
        $this->intLinksCount = $aLinks->length;

        if (0 === $this->intLinksCount) {

            return null;
        }

        return $aLinks;
    }

    /**
     * Set rel="nofollow" on ALL links (a nodes)
     * that have the href attribute.
     * Does nothing when the nofollow flag is off
     * (see setNoFollow())
     *
     * @param bool $bSetTargetBlank if true then also
     * set target="_blank" on the same links
     *
     * @return object $this
     */
    public function setRelNofollow($bSetTargetBlank = true) {
        if (!$this->bNofollow) {

            return $this;
        }

        $aLinks = $this->getAllLinks();
        if (null === $aLinks) {

            return $this;
        }

        for ($i = 0; $i < $this->intLinksCount; $i += 1) {
            $oLink = $aLinks->item($i);
            // 'a' without href is an anchor, not a link
            if (!$oLink->hasAttribute('href')) {
                continue;
            }

            $oLink->setAttribute('rel', 'nofollow');
            if ($bSetTargetBlank) {
                $oLink->setAttribute('target', '_blank');
            }
        }

        return $this;
    }

    /**
     * Find all image elements, then one by one
     * change their 'src' attribute to prefix it with $this->baseUri
     * if necessary.
     *
     * The src is left untouched when it is empty,
     * when it starts with a uri scheme (http:, https:, ftp:, data:, cid:, etc.)
     * or when it is protocol-relative (starts with //)
     *
     * Always calls getImages() so the images counter
     * is updated even when baseUri is not set
     *
     * @return object $this
     */
    public function fixImgBaseUri() {
        $Images = $this->getImages();

        if (empty($this->baseUri) || null === $Images) {

            return $this;
        }

        for ($i = 0; $i < $this->intImgCount; $i += 1) {
            $oImg = $Images->item($i);
            $src = trim($oImg->getAttribute('src'));

            // nothing to resolve, prefixing would make
            // the image point to the base uri itself
            if ('' === $src) {
                continue;
            }

            // anchored on purpose: url that only CONTAINS 'http://'
            // somewhere (like in query string) is still relative
            if (preg_match('/^(?:[a-z][a-z0-9+.\-]*:|\/\/)/i', $src)) {
                continue;
            }

            // baseUri set by loadFeedItem() ends with slash
            // so the leading slash of src must go
            $oImg->setAttribute('src', $this->baseUri . ltrim($src, '/'));
        }

        return $this;
    }

    /**
     * Placeholder: currently does nothing and returns nothing.
     */
    public function fixLinkBaseUri() {
        
    }

    /**
     * Get all img elements of this document
     * and update the images counter
     *
     * @param bool $bAsArray if set to true (default is false)
     * then return value is array with one element per image
     * (indexed from 0 in document order) where each element is array
     * of attribute name => attribute value of image
     * This can be useful if we want to extract all images
     * from the item, like if we want to resize it or
     * use it as thumbnail or something.
     *
     * @return mixed DOMNodeList object or array of image attributes
     * or null if no image tags found
     *
     */
    public function getImages($bAsArray = false) {
        $Images = $this->getElementsByTagName('img');

        $this->intImgCount = $Images->length;

        if (0 === $this->intImgCount) {

            return null;
        }

        if (!$bAsArray) {

            return $Images;
        }

        // start clean so that data from previous call
        // (document could have changed since) does not leak in
        $this->aImages = array();

        for ($i = 0; $i < $this->intImgCount; $i += 1) {
            $oImg = $Images->item($i);

            // every image gets its entry, even the one without
            // attributes, so indexes always match document order
            $this->aImages[$i] = array();

            foreach ($oImg->attributes as $attrName => $attrNode) {
                $this->aImages[$i][$attrName] = $oImg->getAttribute($attrName);
            }
        }

        return $this->aImages;
    }

    /**
     * String representation of this object
     * is the result of getFeedItem()
     *
     * @return string
     */
    public function __toString() {
        return $this->getFeedItem();
    }

}
